Setup

# Import functions
root = "./"
source(file.path(root,"general_functions.R"))
import_parameters(params)
## **** __Utilized Cores__ **** = 2$subsetGenes
## [1] "protein_coding"
## 
## $subsetCells
## [1] 500
## 
## $resolution
## [1] 0.6
## 
## $resultsPath
## [1] "./Results"
## 
## $nCores
## [1] 2
## 
## $perplexity
## [1] 30
# load(file.path(resultsPath, "scRNAseq_results.RData"))
resultsPath <- "Results/subsetGenes-protein_coding__subsetCells-F__Resolution-0.2__perplexity-40__nCores-4"
load(file.path(resultsPath,"scRNAseq_results.RData"))

Load Libraries

library(Seurat)
library(cowplot)
library(ggplot2)
library(dplyr) 
library(data.table) 
library(readxl) 
library(reshape2)

library(GeneOverlap) # BiocManager::install("GeneOverlap")
library(monocle) # BiocManager::install("monocle") 
# BiocManager::install(c('DelayedArray', 'DelayedMatrixStats', 'org.Hs.eg.db', 'org.Mm.eg.db'))
library(org.Hs.eg.db)
library(garnett) # devtools::install_github("cole-trapnell-lab/garnett")
library(enrichR) #BiocManager::install("enrichR")

Identify Cell Types with Garnett + Monocle

Pre-trained PBMC Classifer

# Convert Seurat objt to CDS object 
mDAT <- monocle::importCDS(DAT,  import_all = T) 
# generate size factors for normalization later
mDAT <-  DESeq2::estimateSizeFactors(mDAT)
# Get pre-trained PBMC classifer 
load(file.path(root, "Data/Garnett/hsPBMC")) # Download from: https://cole-trapnell-lab.github.io/garnett/classifiers/hsPBMC  

mDAT <- garnett::classify_cells(mDAT, hsPBMC,
                           db = org.Hs.eg.db,
                           cluster_extend = TRUE,
                           cds_gene_id_type = "SYMBOL")
## Warning in doTryCatch(return(expr), name, parentenv, handler): The
## following genes used in the classifier are not present in the input CDS.
## Interpret with caution. ENSG00000174059The following genes used in the
## classifier are not present in the input CDS. Interpret with caution.
## ENSG00000007062The following genes used in the classifier are not present
## in the input CDS. Interpret with caution. ENSG00000157404The following
## genes used in the classifier are not present in the input CDS. Interpret
## with caution. ENSG00000185291
table(pData(mDAT)$cell_type)
## 
##         B cells           CD34+     CD4 T cells     CD8 T cells 
##              13              92               3               1 
## Dendritic cells       Monocytes        NK cells         T cells 
##             280           15025             801              16 
##         Unknown 
##            2913
table(pData(mDAT)$cluster_ext_type) 
## 
##         B cells           CD34+     CD4 T cells Dendritic cells 
##              12              17               3             165 
##       Monocytes        NK cells         T cells         Unknown 
##           17165             701              15            1066
# Get feature genes for each cell type
feature_genes <- garnett::get_feature_genes(classifier = hsPBMC,
                                   node = "root",
                                   db = org.Hs.eg.db,
                                   convert_ids = F)
head(feature_genes) 
##                      B cells       CD34+ Dendritic cells     Monocytes
## (Intercept)     -0.884989922  0.47694427    -0.124629734 -0.0301907756
## ENSG00000196154 -0.005499815 -0.01088859    -0.007978714  0.0056365451
## ENSG00000019582  0.030146634 -0.01959215     0.015232191  0.0001883507
## ENSG00000156738  2.193353690 -0.66191294    -1.104899464 -0.7393680112
## ENSG00000177455  2.056641938 -0.71851810    -1.941859710 -1.0448682467
## ENSG00000105369  1.800296145 -0.53835614     0.063380804 -0.1973728377
##                     NK cells      T cells     Unknown
## (Intercept)     -0.911657164  0.524757370 0.949765956
## ENSG00000196154 -0.001627233 -0.005835858 0.026193667
## ENSG00000019582 -0.005880096 -0.023473024 0.003378099
## ENSG00000156738 -0.096566144 -0.602417257 1.011810124
## ENSG00000177455  0.297279508  0.177790907 1.173533704
## ENSG00000105369 -0.618567693 -0.963141845 0.453761563
# Convert back to Seurat object & save results
DAT <- Seurat::AddMetaData(DAT, pData(mDAT)[c("garnett_cluster","cell_type","cluster_ext_type")])
write.csv(DAT@meta.data, file.path(resultsPath, "garnett_results.csv"), quote = F,row.names = F)

Biomarker Expression

markerList <- c("CD14", "FCGR3A")
markerData <- DAT@scale.data[row.names(DAT@scale.data) %in% markerList,] %>% t() %>% data.frame()  
## Efficiently merge using data.table
dt1 <- data.table(markerData, keep.rownames = "Cell", key = "Cell") %>% copy()
dt2 <- data.table(DAT@meta.data[c("cell_type","post_clustering")], keep.rownames = "Cell", key = "Cell") %>% copy()
markerDT <- dt1[dt2]
 
ggplot(data = markerDT, aes(x=CD14, y=FCGR3A) ) + 
  stat_density_2d(aes(fill = ..level..), geom = "polygon", colour="purple", bins = 100, size=.1) +
  geom_point(aes(color=as.factor(cell_type)), shape=16, stroke=0, size=2, alpha=.1) + 
  guides(colour = guide_legend(title="cell_type",override.aes = list(alpha = 1))) 

Plot tSNE

tSNE_metadata_plot <- function(var, labSize=12){ 
  # Metadata plot  
  p1 <- Seurat::TSNEPlot(DAT, do.return = T,  do.label = T,  group.by = var,label.size = labSize,
                 plot.title=paste(var), vector.friendly=T) +
    theme(legend.position = "top") + 
    scale_color_brewer( palette = "Dark2",  aesthetics = aes(alpha=.5)) 
     
  # t-SNE clusters plot
  p2 <- Seurat::TSNEPlot(DAT, do.return = T, do.label = T,label.size = labSize,
                 plot.title=paste("Unsupervised Clusters"), vector.friendly=T)  +
    theme(legend.position = "top") + 
    scale_color_brewer( palette = "Set1", aesthetics = aes(alpha=.5))  
  print(cowplot::plot_grid(p1,p2))
}   

Garnett cell_type

tSNE_metadata_plot("cell_type")

Garnett cluster_ext_type

tSNE_metadata_plot("cluster_ext_type")

mut

tSNE_metadata_plot("mut")

tSNE Heatmaps

FeaturePlot(DAT,features.plot =  c("CD14","FCGR3A", "LRRK2", "GBA"),  
            cols.use = c("grey","purple","magenta"), dark.theme = T, nCol = 2 )

FeatureHeatmap(DAT,  c("CD14","FCGR3A","LRRK2", "GBA"), pt.size = 0.5,
               cols.use = c("grey","purple"), plot.horiz =T) 

AD Gene Expression per Cluster

Q: Is AD-related gene expression more prevalent in classical, intermediate or non-classical monocyte subtypes?

ADgenes <- read_excel(file.path(root, "Data/AD_genes.xlsx"))
MonocyteSubtype.markers <- read.csv(file.path(root, resultsPath, "MonocyteSubtype.markers.csv"), row.names = 1)
MonocyteSubtype.markers_sig <- subset(MonocyteSubtype.markers, p_val_adj <= 0.05)

Test Gene Overlap

  • “The GeneOverlap class formulates the problem as testing whether two variables are independent, which can be represented as a contingency table, and then uses Fisher’s exact test to find the statistical significance.”
  • Low p-value means the overlap is highly significant.
  • Documentation
genomeSize <- dim(DAT@raw.data)[1]
list1 <- row.names(MonocyteSubtype.markers_sig)
list2 <- ADgenes$Gene
go.obj <- newGeneOverlap(listA = list1, listB = list2, genome.size = genomeSize )
go.obj <- testGeneOverlap(go.obj)
print(go.obj)
## Detailed information about this GeneOverlap object:
## listA size=205, e.g. FCGR3A RHOC CDKN1C
## listB size=21, e.g. SPI1 CD33 PILRA
## Intersection size=9, e.g. IFITM3 IFITM2 IFITM1
## Union size=217, e.g. FCGR3A RHOC CDKN1C
## Genome size=24914
## # Contingency Table:
##       notA inA
## notB 24697 196
## inB     12   9
## Overlapping p-value=3.9e-14
## Odds ratio=94.4
## Overlap tested using Fisher's exact test (alternative=greater)
## Jaccard Index=0.0
overlappingGenes <- getIntersection(go.obj)
percent_of_ADgenes <-  length(overlappingGenes) / length(ADgenes$Gene)*100
percent_of_DEGs <- length(overlappingGenes) / length(row.names(MonocyteSubtype.markers_sig))*100
AD_DEGs <- subset(ADgenes, Gene %in% overlappingGenes)

cat("\n",round(percent_of_ADgenes, 2),"% of the AD-risk genes (",length(overlappingGenes),"/",length(ADgenes$Gene),") are differentially expressed between Canonical vs. Intermediate monocyte subtypes.")
## 
##  42.86 % of the AD-risk genes ( 9 / 21 ) are differentially expressed between Canonical vs. Intermediate monocyte subtypes.
cat("\n",round(percent_of_DEGs, 2),"% of the DEGs between Canonical vs. Intermediate  monocyte subtypes (",
    length(overlappingGenes),"/",length(row.names(MonocyteSubtype.markers_sig)),") are in the AD-risk gene list.")
## 
##  4.39 % of the DEGs between Canonical vs. Intermediate  monocyte subtypes ( 9 / 205 ) are in the AD-risk gene list.

Percent of Cells that Express AD Genes

# percent_cells_expressing <- function(DAT, ADgenes, clusterList){ 
#   for(clust in clusterList){
#     cat("\nProcessing cluster",clust)
#     clustDAT <- SubsetData(DAT, subset.name = "post_clustering", accept.value = clust) 
#     # Seurat:::PercentAbove(x = DAT@data, threshold = 0) # Very inefficient
#     subDAT <- clustDAT@data[row.names(clustDAT@data ) %in% ADgenes$Gene,] 
#     for(gene in row.names(subDAT)){
#       cat(".")
#       ADgenes[ADgenes$Gene==gene,paste("cluster",clust,sep="")] <- mean(ifelse(subDAT[gene,] > 0, 1, 0))
#     }  
#   } 
#   ADgenes <- melt(ADgenes, id.vars = c("Gene","Category"), measure.vars = paste("cluster",clusterList,sep=""), 
#      variable.name = "Cluster",
#      value.name = "cellsExpressing")
#   ggplot(data=AD_DEGs, aes(x=Cluster, y=cellsExpressing, fill=Gene)) + geom_col(position="dodge") + 
#     labs(title = "Fraction of Cells Expressing AD-associated Genes", y="Fraction of Cells") +
#      scale_x_discrete(labels=c("Cluster0\n(Canonical)", "Cluster1\n(Intermediate)")) 
#   return(ADgenes)
# } 

MonocyteSubtype.markers_sig$Gene <- row.names(MonocyteSubtype.markers_sig)
AD_pct <- melt(subset(MonocyteSubtype.markers_sig, Gene%in%ADgenes$Gene), 
               id.vars = c("Gene","avg_logFC"), measure.vars = c("pct.2", "pct.1"), 
               variable.name = "Group", value.name = "pct") 
# Fraction Cells
ggplot(data=AD_pct, aes(x=Group, y=pct, fill=Gene)) + geom_col(position="dodge") + 
  labs(title = "Fraction of Cells Expressing AD-associated Genes", y="Fraction of Cells", x="Monocyte Subtype") +
   scale_x_discrete(labels=c("Cluster0\n(Canonical)", "Cluster1\n(Intermediate)")) 

# LogFC
ggplot(data=AD_pct, aes(x=Gene, y=avg_logFC, fill=Gene)) + geom_col(position="dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") +
   labs(title = "logFC of AD-associated DEGs:\nCanonical vs. Intermediate Monocytes", y="avg_logFC", x="Monocyte Subtype") 

Expression Heatmap

clustDAT <- SubsetData(DAT, subset.name = "post_clustering", accept.value = c(0,1), do.scale = F) 
markerDF  <- get_markerDT(clustDAT, markerList = ADgenes$Gene, rawData = T) 
markerMatrix <- reshape2::acast(markerDF, Gene~post_clustering, value.var="Expression",
                                fun.aggregate = mean, drop = F, fill = 0)
# Heatmap.2
my_palette <- colorRampPalette(c("purple", "black", "cyan"))(n = 1000)
gplots::heatmap.2(markerMatrix, xlab = "Cluster", dendrogram = "row",
                  col = my_palette, tracecol = "gray", srtCol = 0, offsetCol=1.5, vline=T, 
                  trace='none', key.title=NA, key.ylab = "Expression", colsep=T, sepwidth = 0.01) 

DEG Enrichment

enrichr_dbs <- c("KEGG_2018", "Reactome_2016",
                 "GO_Biological_Process_2018", "GO_Molecular_Function_2018", "GO_Cellular_Component_2018", 
                 "Rare_Diseases_AutoRIF_ARCHS4_Predictions", "Human_Gene_Atlas")
# createDT(enrichR::listEnrichrDbs(), "Enrichr Databases")
 
geneList <- data.frame(Gene=row.names(MonocyteSubtype.markers), 
     Weight=scales::rescale(length(MonocyteSubtype.markers$p_val_adj):1))

results <- enrichr(genes = geneList, databases = enrichr_dbs ) 

Uploading data to Enrichr… Done. Querying KEGG_2018… Done. Querying Reactome_2016… Done. Querying GO_Biological_Process_2018… Done. Querying GO_Molecular_Function_2018… Done. Querying GO_Cellular_Component_2018… Done. Querying Rare_Diseases_AutoRIF_ARCHS4_Predictions… Done. Querying Human_Gene_Atlas… Done. Parsing results… Done.

for (db in enrichr_dbs){
  cat('\n')
  cat("### ",db,"\n")  
  # res <- subset(results[[db]], Adjusted.P.value<=0.05)
  createDT_html(results[[db]], paste("Enrichr Results:",db))
  cat('\n')
}  

KEGG_2018

Reactome_2016

GO_Biological_Process_2018

GO_Molecular_Function_2018

GO_Cellular_Component_2018

Rare_Diseases_AutoRIF_ARCHS4_Predictions

Human_Gene_Atlas